In [ ]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os
import h3
import itertools
from tqdm import tqdm
from multiprocessing import Pool
from scipy.stats import norm
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering

from mirrorverse.warehouse.utils import get_engine
from mirrorverse.chinook.states import spatial_key_to_index

pd.options.mode.chained_assignment = None

os.environ["DATABASE_URL"] = "sqlite:////workspaces/mirrorverse/mirrorverse.db"

Load the Data¶

In [ ]:
sql = '''
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
'''
depth = pd.read_sql(sql, get_engine())
print(depth.shape)
depth.head()
2024-05-14 13:29:25,205 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-14 13:29:25,206 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
")
2024-05-14 13:29:25,206 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-14 13:29:25,209 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
")
2024-05-14 13:29:25,210 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-14 13:29:25,211 INFO sqlalchemy.engine.Engine 
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths

2024-05-14 13:29:25,213 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-14 13:29:28,119 INFO sqlalchemy.engine.Engine ROLLBACK
(1180821, 4)
Out[ ]:
tag_key date_key depth epoch
0 129843 1387411200 83.4 1387411200
1 129843 1387411200 80.7 1387412100
2 129843 1387411200 91.5 1387413000
3 129843 1387411200 91.5 1387413900
4 129843 1387411200 88.8 1387414800

Build or Load the Depth Classes¶

In [ ]:
def divide_among_classes(depth, depth_classes, min_value=0.01):
    sd = depth * 0.08 / 1.96 # ~two standard deviations gives our 95% confidence interval
    if sd == 0:
        division = np.zeros(len(depth_classes))
        division[0] == 1
    else:
        # we're going to assume the depth classes are sorted
        z = (depth_classes - depth) / sd
        division = norm.cdf(z)
        division[1:] = division[1:] - division[:-1]
        division[division < min_value] = 0
    return division

min_size = 0.01
depth_classes = np.array([25, 50, 75, 100, 150, 200, 250, 300, 400, 500])
depth_classes_hash = '_'.join([str(e) for e in depth_classes])
file_prefix = f'depth_classes_{depth_classes_hash}'

if not os.path.exists(f'depth/{file_prefix}.csv'):
    print('Adding Depth Classes...')
    max_rows = 1000000
    i = 0
    rows = []
    rows_scanned = 0
    print('Total Rows to Scan:', depth.shape[0], '...')
    for _, row in tqdm(depth.iterrows()):
        rows_scanned += 1
        membership = divide_among_classes(row['depth'], depth_classes, min_size)
        for depth_class, size in zip(depth_classes, membership):
            if size >= min_size:
                new_row = {k:v for k,v in row.items()}
                new_row['probability'] = size
                new_row['depth_class'] = depth_class
                rows.append(new_row)
        
        if len(rows) >= max_rows:
            pd.DataFrame(rows).to_csv(f'{file_prefix}_{i}.csv', index=False)
            rows = []
            i += 1

    if len(rows):
        pd.DataFrame(rows).to_csv(f'{file_prefix}_{i}.csv', index=False)
        i += 1

    print('Joining DataFrames...')
    dfs = []
    for j in range(i):
        df = pd.read_csv(f'{file_prefix}_{j}.csv')
        df['tag_key'] = df['tag_key'].astype(str)
        dfs.append(df)
    pd.concat(dfs).to_csv(f'depth/{file_prefix}.csv', index=False)

    for j in range(i):
        os.remove(f'{file_prefix}_{j}.csv')

depth = pd.read_csv(f'depth/{file_prefix}.csv')
depth['tag_key'] = depth['tag_key'].astype(str)
/tmp/ipykernel_3247/3790572311.py:56: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  depth = pd.read_csv(f'depth/{file_prefix}.csv')

Adding Context (Features)¶

In [ ]:
sql = '''
select 
    tt.*,
    h.home_region,
    e.elevation
from 
    tag_tracks tt 
    left join home_regions h
        on tt.tag_key = h.tag_key
    left join elevation e 
        on tt.h3_level_4_key = e.h3_level_4_key
'''
tt = pd.read_sql_query(
    sql,
    get_engine()
)
print(tt.head())
tt.head()
In [ ]:
depth = depth.merge(tt[['tag_key', 'date_key', 'longitude', 'latitude', 'home_region', 'elevation']])
depth.head()
In [ ]:
from suntimes import SunTimes

def get_sunrise(lat, lon, date):
    return SunTimes(longitude=lon, latitude=lat, altitude=0).risewhere(date, 'UTC').hour

def get_sunset(lat, lon, date):
    return SunTimes(longitude=lon, latitude=lat, altitude=0).setwhere(date, 'UTC').hour

depth['datetime'] = pd.to_datetime(depth['epoch'], utc=True, unit='s')
depth['date'] = depth['datetime'].dt.date
depth = depth[np.abs(depth['longitude']) <= 180]
depth['sunrise'] = depth.apply(
    lambda r: get_sunrise(r['latitude'], r['longitude'], r['date']), axis=1
)
depth['sunset'] = depth.apply(
    lambda r: get_sunset(r['latitude'], r['longitude'], r['date']), axis=1
)


depth['hour'] = depth['datetime'].dt.hour

depth['daytime'] = (depth['hour'] < depth['sunset']) | (depth['hour'] > depth['sunrise'])
depth.head()
In [ ]:
depth.to_csv(f'depth/{file_prefix}_context.csv', index=False)
In [ ]:
depth = pd.read_csv(f'depth/{file_prefix}_context.csv')
/tmp/ipykernel_3247/1546720778.py:1: DtypeWarning: Columns (0,8) have mixed types. Specify dtype option on import or set low_memory=False.
  depth = pd.read_csv(f'depth/{file_prefix}_context.csv')

Function for Creating Vectors¶

In [ ]:
def build_vectors(depth, depth_classes, group_columns, window_size):
    group_columns = list(group_columns) + ['window']
    depth['window'] = depth['date_key'] - depth['date_key'] % (window_size)
    df = depth.groupby(group_columns + ['depth_class']).agg({'probability': 'sum'}).reset_index()
    df['sum_probability'] = df.groupby(group_columns)[['probability']].transform('sum')
    df['overall_probability'] = df['probability'] / df['sum_probability']

    vectors = defaultdict(lambda: np.zeros(len(depth_classes)))
    indices = {
        v: i 
        for i, v in enumerate(depth_classes)
    }
    print(df.shape[0])
    for _, row in tqdm(df.iterrows()):
        key = tuple([row[key] for key in group_columns])
        vectors[key][indices[row['depth_class']]] = row['overall_probability']

    features = []
    directions = []
    for feature, direction in vectors.items():
        features.append(feature)
        directions.append(direction)

    return features, directions, depth.groupby(group_columns)[['epoch']].nunique().reset_index()

group_columns = ['tag_key', 'daytime']
window_size = 24 * 3600
features, vectors, counts = build_vectors(
    depth, depth_classes, group_columns, window_size
)
47632
47632it [00:01, 40700.02it/s]

Function for Building Distances¶

In [ ]:
def build_distance_matrix(vectors):
    vectors = [v / np.linalg.norm(v) for v in vectors]
    distances = np.zeros((len(vectors), len(vectors)))
    for i, v1 in tqdm(enumerate(vectors)):
        for j, v2 in enumerate(vectors):
            distance = 1 / (max(np.dot(v1, v2), 10 ** -6)) - 1
            distances[i, j] = distance
    return distances

print(len(vectors))
distances = build_distance_matrix(vectors)
13374
13374it [03:16, 68.14it/s]

The Clustering Itself¶

In [ ]:
N = 12

clustering = AgglomerativeClustering(n_clusters=N, metric="precomputed", connectivity=None, linkage="average")
clustering.fit(distances)

max_distance = 0
max_distances = {}
for label in range(N):
    distance = distances[clustering.labels_ == label,:][:,clustering.labels_ == label].max()
    max_distances[label] = distance
    max_distance = max(max_distance, distance)
print(max_distance)
print(pd.Series(clustering.labels_).value_counts() / len(clustering.labels_))
max_distances
7.8574058313038595
9     0.516525
0     0.136982
2     0.129131
5     0.082773
4     0.076417
1     0.024002
3     0.020936
8     0.006729
11    0.002542
6     0.001869
7     0.001495
10    0.000598
Name: count, dtype: float64
Out[ ]:
{0: 7.8574058313038595,
 1: 1.3485093761734581,
 2: 2.4139092413219165,
 3: 3.0454771796167295,
 4: 2.5301855767672485,
 5: 1.5620269473917259,
 6: 0.2799164592527059,
 7: 0.7085194189886073,
 8: 1.3337028235103614,
 9: 2.1271473408426465,
 10: 0.40261818435928065,
 11: 0.3643891176301597}

Add Class Labels to Depth Data¶

In [ ]:
rows = []
for feature_array, label in zip(features, clustering.labels_):
    row = {}
    for group, feature in zip(group_columns + ['window'], feature_array):
        row[group] = feature
    row['cluster'] = label
    rows.append(row)
clusters_df = pd.DataFrame(rows)

depth['window'] = depth['date_key'] - depth['date_key'] % window_size
clusters_df = depth.merge(clusters_df)
clusters_df['datetime'] = pd.to_datetime(clusters_df['epoch'], utc=True, unit='s')
clusters_df['tag_key'] = clusters_df['tag_key'].astype(str)
clusters_df['cluster'] = clusters_df['cluster'].astype(str)
print(clusters_df.shape)
clusters_df.head()
(1290863, 18)
Out[ ]:
tag_key date_key depth epoch probability depth_class longitude latitude home_region elevation datetime date sunrise sunset hour daytime window cluster
0 129843 1387411200 83.4 1387411200 0.993199 100 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:00:00+00:00 2013-12-19 19 2 0 True 1387411200 5
1 129843 1387411200 80.7 1387412100 0.041772 75 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:15:00+00:00 2013-12-19 19 2 0 True 1387411200 5
2 129843 1387411200 80.7 1387412100 0.958228 100 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:15:00+00:00 2013-12-19 19 2 0 True 1387411200 5
3 129843 1387411200 91.5 1387413000 0.988571 100 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:30:00+00:00 2013-12-19 19 2 0 True 1387411200 5
4 129843 1387411200 91.5 1387413000 0.011424 150 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:30:00+00:00 2013-12-19 19 2 0 True 1387411200 5

View It!¶

In [ ]:
tag_key = '129843'
px.scatter(
    clusters_df[clusters_df['tag_key'] == tag_key], x='datetime', y='depth', color='cluster'
)
In [ ]:
vector_sums = defaultdict(lambda: np.zeros(len(vectors[0])))
sizes = defaultdict(int)
for label, vector in zip(clustering.labels_, vectors):
    vector_sums[label] += vector
    sizes[label] += 1
mean_vectors = {
    label: vector_sums[label] / sizes[label]
    for label in clustering.labels_
}
rows = []
for label, vector in mean_vectors.items():
    for depth_class, prob in zip(depth_classes, vector):
        rows.append({
            'cluster': str(label),
            'depth_class': depth_class,
            'probability': prob
        })
groups_df = pd.DataFrame(rows)
groups_df.head()
Out[ ]:
cluster depth_class probability
0 5 25 0.044002
1 5 50 0.045928
2 5 75 0.225012
3 5 100 0.549442
4 5 150 0.129686
In [ ]:
px.bar(
    groups_df, x='depth_class', y='probability', color='cluster', facet_row='cluster', width=800, height=1200
)

Modeling¶

In [ ]:
clusters_df['month'] = clusters_df['datetime'].dt.month
clusters_df.head()
Out[ ]:
tag_key date_key depth epoch probability depth_class longitude latitude home_region elevation datetime date sunrise sunset hour daytime window cluster month
0 129843 1387411200 83.4 1387411200 0.993199 100 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:00:00+00:00 2013-12-19 19 2 0 True 1387411200 5 12
1 129843 1387411200 80.7 1387412100 0.041772 75 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:15:00+00:00 2013-12-19 19 2 0 True 1387411200 5 12
2 129843 1387411200 80.7 1387412100 0.958228 100 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:15:00+00:00 2013-12-19 19 2 0 True 1387411200 5 12
3 129843 1387411200 91.5 1387413000 0.988571 100 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:30:00+00:00 2013-12-19 19 2 0 True 1387411200 5 12
4 129843 1387411200 91.5 1387413000 0.011424 150 -166.922615 54.13176 NaN -184.870688 2013-12-19 00:30:00+00:00 2013-12-19 19 2 0 True 1387411200 5 12
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

tag_keys = clusters_df['tag_key'].unique()
train_keys = np.random.choice(tag_keys, int(0.8 * len(tag_keys)), replace=False)
test_keys = np.array(list(set(tag_keys) - set(train_keys)))



folds = 4
rows = []
np.random.shuffle(train_keys)
for i, tag_key in enumerate(train_keys):
    rows.append({
        'tag_key': tag_key,
        'fold': i % folds
    })
tag_folds = pd.DataFrame(rows)
tag_folds.head()
Out[ ]:
tag_key fold
0 202597 0
1 159006 1
2 205411 2
3 229206 3
4 172906 0
In [ ]:
df = (clusters_df['cluster'].value_counts() / clusters_df.shape[0]).reset_index()
allowable_clusters = set(df[df['count'] >= 0.05]['cluster'])
allowable_clusters
Out[ ]:
{'0', '2', '4', '5', '9'}
In [ ]:
from imblearn.over_sampling import RandomOverSampler

df = clusters_df.drop_duplicates(['tag_key', 'epoch'])
features = ['daytime', 'month', 'elevation', 'sunset', 'sunrise']
X_train = df[df['tag_key'].isin(train_keys) & df['cluster'].isin(allowable_clusters)][features + ['tag_key', 'cluster']].copy()
y_train = df[df['tag_key'].isin(train_keys) & df['cluster'].isin(allowable_clusters)]['cluster'].copy().astype(int)

print('Before Resampling:', X_train.shape)
old_shape = X_train.shape
X_train, y_train = RandomOverSampler().fit_resample(X_train, y_train)
print('After Resampling:', X_train.shape)

X_train = X_train.sample(old_shape[0])
print('Downsampling:', X_train.shape)

X_train = X_train.merge(tag_folds)
keys_train_array = X_train['tag_key']
fold = np.array(X_train['fold'])
y_train = X_train['cluster'].astype(int).reset_index(drop=True)
X_train = X_train[features].reset_index(drop=True)


X_test = df[df['tag_key'].isin(test_keys) & df['cluster'].isin(allowable_clusters)][features].copy()
y_test = df[df['tag_key'].isin(test_keys) & df['cluster'].isin(allowable_clusters)]['cluster'].copy().astype(int)

split = PredefinedSplit(fold)
model = GridSearchCV(
    estimator=RandomForestClassifier(
        bootstrap=False, n_jobs=(os.cpu_count() - 2)
    ),
    param_grid={"n_estimators": [20], "min_weight_fraction_leaf": [0.01, 0.05, 0.1]},#"min_samples_leaf": [10, 50, 100, 500, 1000, 5000]},
    return_train_score=True,
    cv=split,
    refit=True,
)
#StratifiedKFold(n_splits=5, shuffle=True, random_state=42),#KFold(n_splits=5, shuffle=True, random_state=42),

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

print('Evaluating...')

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
X_test_rebalanced, y_test_rebalanced = RandomOverSampler().fit_resample(X_test, y_test)
y_pred_test_rebalanced = model.predict(X_test_rebalanced)
print(
    round(accuracy_score(y_train, y_pred_train),3), 
    round(accuracy_score(y_test, y_pred_test), 3), 
    round(accuracy_score(y_test_rebalanced, y_pred_test_rebalanced), 3)
)
model.best_estimator_
Before Resampling: (764990, 7)
After Resampling: (2314610, 7)
Downsampling: (764990, 7)
Evaluating...
0.461 0.367 0.324
Out[ ]:
RandomForestClassifier(bootstrap=False, min_weight_fraction_leaf=0.01,
                       n_estimators=20, n_jobs=6)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(bootstrap=False, min_weight_fraction_leaf=0.01,
                       n_estimators=20, n_jobs=6)
In [ ]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test_rebalanced, y_pred_test_rebalanced)
(cm.T / cm.T.sum(axis=0)).T
Out[ ]:
array([[0.43260292, 0.17494706, 0.11424087, 0.19156541, 0.08664374],
       [0.12190799, 0.21133022, 0.22839914, 0.17887449, 0.25948816],
       [0.18460418, 0.27417659, 0.24567082, 0.23804875, 0.05749966],
       [0.22035655, 0.29741826, 0.13560518, 0.3165748 , 0.03004521],
       [0.12981542, 0.19273688, 0.1862938 , 0.07851854, 0.41263536]])
In [ ]:
cm = confusion_matrix(y_train, y_pred_train)
(cm.T / cm.T.sum(axis=0)).T
Out[ ]:
array([[0.56606852, 0.06999674, 0.08926591, 0.16788254, 0.1067863 ],
       [0.15751252, 0.32812938, 0.11422654, 0.13007939, 0.27005217],
       [0.18707267, 0.13257072, 0.35099807, 0.19103871, 0.13831984],
       [0.21707628, 0.09799928, 0.11027775, 0.46248898, 0.11215771],
       [0.15480477, 0.12474502, 0.0652355 , 0.05695152, 0.59826319]])
In [ ]:
cm = confusion_matrix(y_test, y_pred_test)
(cm.T / cm.T.sum(axis=0)).T
Out[ ]:
array([[0.43161726, 0.17659906, 0.11565263, 0.18939158, 0.08673947],
       [0.12113256, 0.21199972, 0.22683083, 0.18042152, 0.25961538],
       [0.1849387 , 0.27483946, 0.244892  , 0.23741973, 0.0579101 ],
       [0.22111925, 0.29806795, 0.13570953, 0.31472352, 0.03037975],
       [0.12981542, 0.19273688, 0.1862938 , 0.07851854, 0.41263536]])
In [ ]:
y_train.value_counts() / y_train.shape[0]
Out[ ]:
cluster
2    0.200705
0    0.200329
5    0.200258
4    0.199408
9    0.199301
Name: count, dtype: float64
In [ ]:
pd.Series(y_pred_train).value_counts() / y_pred_train.shape[0]
Out[ ]:
0    0.256641
9    0.244870
5    0.201801
2    0.150802
4    0.145886
Name: count, dtype: float64
In [ ]:
y_test.value_counts() / y_test.shape[0]
Out[ ]:
cluster
9    0.626015
2    0.132493
0    0.090401
4    0.080528
5    0.070562
Name: count, dtype: float64
In [ ]:
pd.Series(y_pred_test).value_counts() / y_pred_test.shape[0]
Out[ ]:
9    0.307362
2    0.207874
4    0.186428
0    0.166830
5    0.131506
Name: count, dtype: float64
In [ ]:
pd.Series(y_pred_test_rebalanced).value_counts() / y_pred_test_rebalanced.shape[0]
Out[ ]:
2    0.230122
0    0.217857
5    0.200716
4    0.182042
9    0.169262
Name: count, dtype: float64
In [ ]:
print(features)
model.best_estimator_.feature_importances_
['daytime', 'month', 'elevation', 'sunset', 'sunrise']
Out[ ]:
array([0.06200596, 0.23282532, 0.35349933, 0.12604526, 0.22562413])
In [ ]: